import os
# Set USER_AGENT in code. This assignment must be placed before the
# WebBaseLoader import below; otherwise the problem is still reported.
os.environ['USER_AGENT'] = 'Mozilla/5.0 (Windows NT 14.0; Win64; x64) AppleWebKit/567.36 (KHTML, like Gecko) Chrome/58.0.444.11 Safari/337.3'

from langchain_community.document_loaders import WebBaseLoader
# Warning log message seen when the variable is missing:
# "USER_AGENT environment variable not set, consider setting it to identify your requests."

# Create the loader from the list of target URLs (several may be supplied).
urls = ["https://www.cnblogs.com"]
loader = WebBaseLoader(urls)

# Fetch the pages; the result is indexed below, one entry per document.
docs = loader.load()

# Show a short summary of the first document: text length, a 200-character
# preview of the text, and its metadata.
first_doc = docs[0]
page_text = first_doc.page_content
print(f"提取文本长度:{len(page_text)}字符")
print(f"前200个字符:\n{page_text[:200]}...")
print(f"元数据:{first_doc.metadata}")